In [1]:
%pylab inline
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import pickle
import itertools
import statsmodels.formula.api as smf
import statsmodels.tsa.stattools

import warnings
from itertools import product

warnings.filterwarnings('ignore')
Populating the interactive namespace from numpy and matplotlib
In [2]:
import statsmodels
statsmodels.__version__
Out[2]:
'0.9.0'
In [3]:
def invboxcox(y,lmbda):
    if lmbda == 0:
        return(np.exp(y))
    else:
        return(np.exp(np.log(lmbda*y+1)/lmbda))
In [2]:
mypath = "H:/Yandex machine learning/finall course coursera/csv_yellow_cab_2015/reg_bin_stat/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

print(onlyfiles)
['reg_bin_stat_2015-06.pkl', 'reg_bin_stat_2015-01.pkl', 'reg_bin_stat_2015-02.pkl', 'reg_bin_stat_2015-03.pkl', 'reg_bin_stat_2015-04.pkl', 'reg_bin_stat_2015-05.pkl']
In [3]:
ESB_reg = 1231
month = []
In [4]:
for file in onlyfiles[1:] + [onlyfiles[0]]:
    with open(mypath + file, "rb") as f:
        region_bined_stat = pickle.load(f)
        month.append(region_bined_stat[ESB_reg-1, :])

for i in month:
    print(len(i), end=" ")
744 672 744 720 744 720 
In [5]:
month_join = list(itertools.chain.from_iterable(month))
time = pd.date_range('2015 Jan 1 00:00:00', periods = len(month_join), freq = 'h')
taxi_data = pd.DataFrame({"taxi_call_num" : month_join}, index = [time])

taxi_data.head()
Out[5]:
taxi_call_num
2015-01-01 00:00:00 1125.0
2015-01-01 01:00:00 922.0
2015-01-01 02:00:00 767.0
2015-01-01 03:00:00 774.0
2015-01-01 04:00:00 548.0
In [9]:
print(taxi_data.index.values.shape)
print(taxi_data.taxi_call_num.values.shape)
(4344,)
(4344,)
In [16]:
fig, axes = plt.subplots(figsize = (80, 8))
axes.title.set_size(40)
taxi_data.plot(color="green", title="timeserie", fontsize=25, ax = axes)
frequency = 24
plt.xticks(time[::24])
plt.grid()

#double click on image makes it larger
plot first three month separately
In [21]:
#num of hours in each month
month_len = np.array([0, 744, 672, 744, 720, 744, 720])

for i, c in enumerate(["January", "February", "March"]):
    taxi_data[int(sum(month_len[:i+1])) : int(sum(month_len[:i+2]))].plot(figsize=(20, 5), title=c, color="green", fontsize=16, grid=True)
Dickey-Fuller stationarity test + time-serie STL-decomposition:
In [27]:
plt.figure(figsize(60,10))
sm.tsa.seasonal_decompose(taxi_data.taxi_call_num).plot()
print("Dickey-Fuller stationarity test: p=%f" % sm.tsa.stattools.adfuller(taxi_data.taxi_call_num)[1])

#double click on image makes it larger
Dickey-Fuller stationarity test: p=0.000000
<Figure size 4320x720 with 0 Axes>
half year time serie seems to be non-dispersive but it has distinctly visible monthly seasonality. Рypothesis of nonstationarity is rejected with the Dickey-Fuller criterion.
plot the same for january month
In [28]:
plt.figure(figsize(15,8))
sm.tsa.seasonal_decompose(taxi_data.taxi_call_num[0:744]).plot()
print("Dickey-Fuller stationarity test: p=%f" % sm.tsa.stattools.adfuller(taxi_data.taxi_call_num[0:744])[1])
Dickey-Fuller stationarity test: p=0.126987
<Figure size 1080x576 with 0 Axes>
weekends may be determined from the trend plot and from seasonal graph it is obvious that each day has two maximum values - morning and evening and two minimums - day and night. Night minimum is much deeper.
In [10]:
plt.figure(figsize=(15,4))
plt.plot(taxi_data.index, taxi_data.taxi_call_num)
""" moving average """
plt.plot(taxi_data.index, taxi_data.taxi_call_num.rolling(window=24).mean(), 'r')
Out[10]:
[<matplotlib.lines.Line2D at 0x7d1e320>]
From 6-month time serie data it may be concluded that taxi trips have weekly and daily periodicity and also periodicity within one day - night, morning, day, evening. Sunday is most "silent" day then amount of trips starts to grow getting maximum values in the middle of week with a subsequent decrease.
Weekly and daily periodicity will be accounted by making a lineat regression model on fourier serie features with periods 7*24 hours (week) и 24 hours (day); morning/evening periodicity should be taken into account by the ARIMA model.
T=168 t = 24 W(i) = T/2, T/3, T/4 ... w(i) = t/2, t/3, t/4 ...
lets take first 100 garmonics
In [13]:
T = 168
t = 28
time = np.arange(0, taxi_data.shape[0])
regression_taxi = taxi_data.copy()
#regression_taxi.loc[regression_taxi["taxi_call_num"] == 0] = 1
#regression_taxi['log_taxi_call_num'], lmbda = stats.boxcox(regression_taxi.taxi_call_num)
In [20]:
#t=28
#tt=48
#ttt = 14
for w in range(1, 100):
    regression_taxi["sin_sin_w_" + str(w)] = np.sin(2*np.pi*w*time/T) * np.sin(2*np.pi*w*time/t) 
    regression_taxi["cos_cos_w_" + str(w)] = np.cos(2*np.pi*w*time/T) * np.cos(2*np.pi*w*time/t)
    
    regression_taxi["sin_cos_w_" + str(w)] = np.sin(2*np.pi*w*time/T) * np.cos(2*np.pi*w*time/t)
    regression_taxi["cos_sin_w_" + str(w)] = np.cos(2*np.pi*w*time/T) * np.sin(2*np.pi*w*time/t) 
In [21]:
regression_taxi.head()
Out[21]:
taxi_call_num sin_sin_w_1 cos_cos_w_1 sin_cos_w_1 cos_sin_w_1 sin_sin_w_2 cos_cos_w_2 sin_cos_w_2 cos_sin_w_2 sin_sin_w_3 ... sin_cos_w_97 cos_sin_w_97 sin_sin_w_98 cos_cos_w_98 sin_cos_w_98 cos_sin_w_98 sin_sin_w_99 cos_cos_w_99 sin_cos_w_99 cos_sin_w_99
2015-01-01 00:00:00 1125.0 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000e+00 1.000000e+00 0.000000 0.000000e+00 0.000000 1.000000 0.000000 0.000000
2015-01-01 01:00:00 922.0 0.008320 0.974246 0.036454 0.222365 0.032424 0.898450 0.067329 0.432671 0.069809 ... 0.455553 -0.196734 -4.286264e-16 8.660254e-01 0.500000 -7.424027e-16 0.118388 0.825495 0.518693 0.188414
2015-01-01 02:00:00 767.0 0.032424 0.898450 0.067329 0.432671 0.116526 0.616526 0.092926 0.773099 0.216942 ... 0.744415 -0.244415 -1.484805e-15 5.000000e-01 0.866025 -8.572528e-16 0.390916 0.390916 0.811745 0.188255
2015-01-01 03:00:00 774.0 0.069809 0.776915 0.087537 0.619569 0.216942 0.216942 0.049516 0.950484 0.297571 ... 0.776915 -0.069809 -1.322990e-14 4.286264e-16 1.000000 -5.670684e-30 0.619569 -0.087537 0.776915 -0.069809
2015-01-01 04:00:00 548.0 0.116526 0.616526 0.092926 0.773099 0.287365 -0.212635 -0.065589 0.931615 0.188255 ... 0.580390 0.285635 -2.969611e-15 -5.000000e-01 0.866025 1.714506e-15 0.611260 -0.388740 0.487464 -0.487464

5 rows × 397 columns

In [22]:
regression_taxi.shape
Out[22]:
(4344, 397)
In [23]:
models = []
for w in range(1, 100):
    m1 = smf.ols('taxi_call_num ~ ' + ' + '.join(list(regression_taxi.columns)[1:4*w+1]), data=regression_taxi)
    model = m1.fit(cov_type='HC1')
    models.append(model)
    
# fitted.summary(); fitted.params; models[-1].rsquared; params.Intercept  
plot r-squared (proportion of the variance in the dependent variable that is predictable from the independent variable) dependence on number of fourier components and similar dependence but for mean squared error of the residuals (sum of squared residuals divided by the residual degrees of freedom)
Here we used slighly different features - smth. like "modulation" and take one of the periods not equal to daily period but larger - 28 hours. Obtained result is quite good - up to 90 % of predictable variance.
In [27]:
import warnings
warnings.filterwarnings('ignore')

fig1 = figure(figsize=(15,5))

ax1 = fig1.add_subplot(111)
line1 = ax1.plot(range(1, 100), list(map(lambda x: x.rsquared, models)), color="blue", label="r-square")
ylabel("r-squared")

ax2 = fig1.add_subplot(111, sharex=ax1, frameon=False)
line2 = ax2.plot(range(1, 100), list(map(lambda x: x.mse_resid, models)), color="green", label="resid")
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position("right")
ylabel("mse_resid")

legend((line1, line2), ("1", "2"))
plt.xlabel("number of fourier components")
legend()
grid()
show() 
plot dependence of correlation of first lag with 168, 24 and 14 lag on number of fourier components
In [28]:
statsmodels.tsa.stattools.acf(x=models[3].resid, nlags=168)[168]
Out[28]:
0.6599616033184009
In [30]:
plt.figure(figsize=(15,6))
plt.plot(range(1, 100), [statsmodels.tsa.stattools.acf(x=model.resid, nlags=168)[168] for model in models], color="blue", label='week')
plt.plot(range(1, 100), [statsmodels.tsa.stattools.acf(x=model.resid, nlags=168)[24] for model in models], color="green", label='day')
plt.plot(range(1, 100), [statsmodels.tsa.stattools.acf(x=model.resid, nlags=168)[14] for model in models], color="red", label='14 hours')
plt.legend()
plt.grid()
plt.xlabel("number of fourier components")
plt.ylabel("autocorellation with lag")
Out[30]:
Text(0,0.5,'autocorellation with lag')
80 components is enought for achiving good performance and quite high decorelation. Фs option is possible to choose 67 components as tradeoff between day and week decorelation
In [31]:
models[80].summary()
Out[31]:
OLS Regression Results
Dep. Variable: taxi_call_num R-squared: 0.890
Model: OLS Adj. R-squared: 0.881
Method: Least Squares F-statistic: 1.725
Date: Thu, 05 Jul 2018 Prob (F-statistic): 1.91e-07
Time: 22:43:34 Log-Likelihood: -27076.
No. Observations: 4344 AIC: 5.477e+04
Df Residuals: 4035 BIC: 5.674e+04
Df Model: 308
Covariance Type: HC1
coef std err z P>|z| [0.025 0.975]
Intercept -1.253e+09 1.25e+09 -1.003 0.316 -3.7e+09 1.2e+09
sin_sin_w_1 4.873e+12 4.48e+12 1.087 0.277 -3.91e+12 1.37e+13
cos_cos_w_1 -4.873e+12 4.48e+12 -1.087 0.277 -1.37e+13 3.91e+12
sin_cos_w_1 2.332e+12 4.09e+12 0.570 0.569 -5.69e+12 1.04e+13
cos_sin_w_1 2.332e+12 4.09e+12 0.570 0.569 -5.69e+12 1.04e+13
sin_sin_w_2 3.698e+12 6.34e+12 0.583 0.560 -8.73e+12 1.61e+13
cos_cos_w_2 -3.698e+12 6.34e+12 -0.583 0.560 -1.61e+13 8.73e+12
sin_cos_w_2 -1.843e+12 6.37e+12 -0.289 0.772 -1.43e+13 1.06e+13
cos_sin_w_2 -1.843e+12 6.37e+12 -0.289 0.772 -1.43e+13 1.06e+13
sin_sin_w_3 4.625e+12 3.91e+12 1.184 0.237 -3.03e+12 1.23e+13
cos_cos_w_3 -4.625e+12 3.91e+12 -1.184 0.237 -1.23e+13 3.03e+12
sin_cos_w_3 1.938e+12 4.07e+12 0.476 0.634 -6.04e+12 9.91e+12
cos_sin_w_3 1.938e+12 4.07e+12 0.476 0.634 -6.04e+12 9.91e+12
sin_sin_w_4 -5.309e+12 1e+13 -0.529 0.597 -2.5e+13 1.44e+13
cos_cos_w_4 5.309e+12 1e+13 0.529 0.597 -1.44e+13 2.5e+13
sin_cos_w_4 -1.784e+13 1.04e+13 -1.710 0.087 -3.83e+13 2.61e+12
cos_sin_w_4 -1.784e+13 1.04e+13 -1.710 0.087 -3.83e+13 2.61e+12
sin_sin_w_5 7.274e+11 7.23e+12 0.101 0.920 -1.34e+13 1.49e+13
cos_cos_w_5 -7.274e+11 7.23e+12 -0.101 0.920 -1.49e+13 1.34e+13
sin_cos_w_5 -1.721e+12 6.99e+12 -0.246 0.806 -1.54e+13 1.2e+13
cos_sin_w_5 -1.721e+12 6.99e+12 -0.246 0.806 -1.54e+13 1.2e+13
sin_sin_w_6 4.506e+12 6.76e+12 0.667 0.505 -8.74e+12 1.78e+13
cos_cos_w_6 -4.506e+12 6.76e+12 -0.667 0.505 -1.78e+13 8.74e+12
sin_cos_w_6 7.185e+11 7.2e+12 0.100 0.921 -1.34e+13 1.48e+13
cos_sin_w_6 7.185e+11 7.2e+12 0.100 0.921 -1.34e+13 1.48e+13
sin_sin_w_7 9.279e+11 8.65e+12 0.107 0.915 -1.6e+13 1.79e+13
cos_cos_w_7 5.377e+12 8.28e+12 0.649 0.516 -1.08e+13 2.16e+13
sin_cos_w_7 -9.208e+12 8.06e+12 -1.143 0.253 -2.5e+13 6.58e+12
cos_sin_w_7 -9.395e+12 8.06e+12 -1.165 0.244 -2.52e+13 6.41e+12
sin_sin_w_8 1.57e+12 1.37e+12 1.144 0.252 -1.12e+12 4.26e+12
cos_cos_w_8 -1.57e+12 1.37e+12 -1.144 0.252 -4.26e+12 1.12e+12
sin_cos_w_8 -1.304e+12 1.37e+12 -0.950 0.342 -3.99e+12 1.39e+12
cos_sin_w_8 -1.304e+12 1.37e+12 -0.950 0.342 -3.99e+12 1.39e+12
sin_sin_w_9 2.811e+12 3.85e+12 0.729 0.466 -4.74e+12 1.04e+13
cos_cos_w_9 -2.811e+12 3.85e+12 -0.729 0.466 -1.04e+13 4.74e+12
sin_cos_w_9 1.125e+12 3.95e+12 0.285 0.776 -6.61e+12 8.86e+12
cos_sin_w_9 1.125e+12 3.95e+12 0.285 0.776 -6.61e+12 8.86e+12
sin_sin_w_10 1.585e+12 5.38e+12 0.295 0.768 -8.95e+12 1.21e+13
cos_cos_w_10 -1.585e+12 5.38e+12 -0.295 0.768 -1.21e+13 8.95e+12
sin_cos_w_10 -7.432e+12 5.08e+12 -1.463 0.143 -1.74e+13 2.52e+12
cos_sin_w_10 -7.432e+12 5.08e+12 -1.463 0.143 -1.74e+13 2.52e+12
sin_sin_w_11 3.589e+12 3.51e+12 1.023 0.306 -3.29e+12 1.05e+13
cos_cos_w_11 -3.589e+12 3.51e+12 -1.023 0.306 -1.05e+13 3.29e+12
sin_cos_w_11 2.216e+12 3.69e+12 0.601 0.548 -5.01e+12 9.45e+12
cos_sin_w_11 2.216e+12 3.69e+12 0.601 0.548 -5.01e+12 9.45e+12
sin_sin_w_12 3.333e+09 9.8e+09 0.340 0.734 -1.59e+10 2.25e+10
cos_cos_w_12 -3.333e+09 9.8e+09 -0.340 0.734 -2.25e+10 1.59e+10
sin_cos_w_12 3.084e+12 3.1e+12 0.994 0.320 -3e+12 9.16e+12
cos_sin_w_12 3.084e+12 3.1e+12 0.994 0.320 -3e+12 9.16e+12
sin_sin_w_13 -5.302e+12 3.48e+12 -1.522 0.128 -1.21e+13 1.53e+12
cos_cos_w_13 5.302e+12 3.48e+12 1.522 0.128 -1.53e+12 1.21e+13
sin_cos_w_13 2.182e+12 3.56e+12 0.613 0.540 -4.8e+12 9.16e+12
cos_sin_w_13 2.182e+12 3.56e+12 0.613 0.540 -4.8e+12 9.16e+12
sin_sin_w_14 -7.927e+12 4.39e+12 -1.808 0.071 -1.65e+13 6.68e+11
cos_cos_w_14 1.778e+12 5.92e+12 0.301 0.764 -9.82e+12 1.34e+13
sin_cos_w_14 -8.981e+12 5.75e+12 -1.563 0.118 -2.02e+13 2.28e+12
cos_sin_w_14 2.994e+12 4.06e+12 0.738 0.461 -4.96e+12 1.1e+13
sin_sin_w_15 -4.192e+12 2.88e+12 -1.454 0.146 -9.84e+12 1.46e+12
cos_cos_w_15 4.192e+12 2.88e+12 1.454 0.146 -1.46e+12 9.84e+12
sin_cos_w_15 -1.771e+12 2.98e+12 -0.594 0.552 -7.61e+12 4.07e+12
cos_sin_w_15 -1.771e+12 2.98e+12 -0.594 0.552 -7.61e+12 4.07e+12
sin_sin_w_16 1.271e+12 1.15e+12 1.108 0.268 -9.78e+11 3.52e+12
cos_cos_w_16 -1.271e+12 1.15e+12 -1.108 0.268 -3.52e+12 9.78e+11
sin_cos_w_16 8.97e+11 1.13e+12 0.793 0.428 -1.32e+12 3.11e+12
cos_sin_w_16 8.97e+11 1.13e+12 0.793 0.428 -1.32e+12 3.11e+12
sin_sin_w_17 3.334e+12 3.41e+12 0.977 0.329 -3.36e+12 1e+13
cos_cos_w_17 -3.334e+12 3.41e+12 -0.977 0.329 -1e+13 3.36e+12
sin_cos_w_17 -3.998e+12 3.15e+12 -1.270 0.204 -1.02e+13 2.17e+12
cos_sin_w_17 -3.998e+12 3.15e+12 -1.270 0.204 -1.02e+13 2.17e+12
sin_sin_w_18 -4.077e+12 3.13e+12 -1.301 0.193 -1.02e+13 2.07e+12
cos_cos_w_18 4.077e+12 3.13e+12 1.301 0.193 -2.07e+12 1.02e+13
sin_cos_w_18 2.455e+12 2.98e+12 0.823 0.410 -3.39e+12 8.3e+12
cos_sin_w_18 2.455e+12 2.98e+12 0.823 0.410 -3.39e+12 8.3e+12
sin_sin_w_19 2.084e+12 2.76e+12 0.754 0.451 -3.33e+12 7.5e+12
cos_cos_w_19 -2.084e+12 2.76e+12 -0.754 0.451 -7.5e+12 3.33e+12
sin_cos_w_19 -1.704e+12 2.66e+12 -0.640 0.522 -6.93e+12 3.52e+12
cos_sin_w_19 -1.704e+12 2.66e+12 -0.640 0.522 -6.93e+12 3.52e+12
sin_sin_w_20 2.873e+12 2.6e+12 1.105 0.269 -2.22e+12 7.97e+12
cos_cos_w_20 -2.873e+12 2.6e+12 -1.105 0.269 -7.97e+12 2.22e+12
sin_cos_w_20 5.265e+10 2.68e+12 0.020 0.984 -5.19e+12 5.3e+12
cos_sin_w_20 5.265e+10 2.68e+12 0.020 0.984 -5.19e+12 5.3e+12
sin_sin_w_21 -4.34e+09 3.93e+12 -0.001 0.999 -7.71e+12 7.7e+12
cos_cos_w_21 1.128e+12 3.58e+12 0.315 0.753 -5.89e+12 8.15e+12
sin_cos_w_21 -2.071e+12 3.62e+12 -0.572 0.568 -9.17e+12 5.03e+12
cos_sin_w_21 -9.457e+11 3.93e+12 -0.240 0.810 -8.66e+12 6.77e+12
sin_sin_w_22 -4.505e+12 2.21e+12 -2.034 0.042 -8.84e+12 -1.65e+11
cos_cos_w_22 4.505e+12 2.21e+12 2.034 0.042 1.65e+11 8.84e+12
sin_cos_w_22 -2.27e+12 1.99e+12 -1.144 0.253 -6.16e+12 1.62e+12
cos_sin_w_22 -2.27e+12 1.99e+12 -1.144 0.253 -6.16e+12 1.62e+12
sin_sin_w_23 6.734e+12 2.28e+12 2.950 0.003 2.26e+12 1.12e+13
cos_cos_w_23 -6.734e+12 2.28e+12 -2.950 0.003 -1.12e+13 -2.26e+12
sin_cos_w_23 -3.314e+12 2.14e+12 -1.548 0.122 -7.51e+12 8.81e+11
cos_sin_w_23 -3.314e+12 2.14e+12 -1.548 0.122 -7.51e+12 8.81e+11
sin_sin_w_24 3.865e+09 3.64e+09 1.061 0.289 -3.28e+09 1.1e+10
cos_cos_w_24 -3.865e+09 3.64e+09 -1.061 0.289 -1.1e+10 3.28e+09
sin_cos_w_24 3.442e+11 3.13e+11 1.100 0.271 -2.69e+11 9.57e+11
cos_sin_w_24 3.442e+11 3.13e+11 1.100 0.271 -2.69e+11 9.57e+11
sin_sin_w_25 -2.409e+12 2.18e+12 -1.106 0.269 -6.68e+12 1.86e+12
cos_cos_w_25 2.409e+12 2.18e+12 1.106 0.269 -1.86e+12 6.68e+12
sin_cos_w_25 2.364e+12 2.12e+12 1.114 0.265 -1.8e+12 6.52e+12
cos_sin_w_25 2.364e+12 2.12e+12 1.114 0.265 -1.8e+12 6.52e+12
sin_sin_w_26 -4.774e+11 2.25e+12 -0.212 0.832 -4.88e+12 3.93e+12
cos_cos_w_26 4.774e+11 2.25e+12 0.212 0.832 -3.93e+12 4.88e+12
sin_cos_w_26 8.685e+11 2.1e+12 0.413 0.680 -3.26e+12 4.99e+12
cos_sin_w_26 8.685e+11 2.1e+12 0.413 0.680 -3.26e+12 4.99e+12
sin_sin_w_27 1.728e+12 1.92e+12 0.898 0.369 -2.04e+12 5.5e+12
cos_cos_w_27 -1.728e+12 1.92e+12 -0.898 0.369 -5.5e+12 2.04e+12
sin_cos_w_27 -2.49e+12 2.05e+12 -1.212 0.226 -6.52e+12 1.54e+12
cos_sin_w_27 -2.49e+12 2.05e+12 -1.212 0.226 -6.52e+12 1.54e+12
sin_sin_w_28 3.529e+12 2.24e+12 1.575 0.115 -8.63e+11 7.92e+12
cos_cos_w_28 -2.312e+12 1.01e+13 -0.230 0.818 -2.21e+13 1.74e+13
sin_cos_w_28 1.675e+13 1.02e+13 1.639 0.101 -3.28e+12 3.68e+13
cos_sin_w_28 1.426e+12 2.26e+12 0.630 0.528 -3.01e+12 5.86e+12
sin_sin_w_29 1.113e+11 1.94e+12 0.057 0.954 -3.69e+12 3.91e+12
cos_cos_w_29 -1.113e+11 1.94e+12 -0.057 0.954 -3.91e+12 3.69e+12
sin_cos_w_29 -1.084e+12 1.8e+12 -0.603 0.547 -4.61e+12 2.44e+12
cos_sin_w_29 -1.084e+12 1.8e+12 -0.603 0.547 -4.61e+12 2.44e+12
sin_sin_w_30 8.743e+11 1.69e+12 0.518 0.604 -2.43e+12 4.18e+12
cos_cos_w_30 -8.743e+11 1.69e+12 -0.518 0.604 -4.18e+12 2.43e+12
sin_cos_w_30 -4.312e+11 1.65e+12 -0.261 0.794 -3.67e+12 2.8e+12
cos_sin_w_30 -4.312e+11 1.65e+12 -0.261 0.794 -3.67e+12 2.8e+12
sin_sin_w_31 -1.298e+12 1.79e+12 -0.723 0.470 -4.82e+12 2.22e+12
cos_cos_w_31 1.298e+12 1.79e+12 0.723 0.470 -2.22e+12 4.82e+12
sin_cos_w_31 2.699e+12 1.83e+12 1.477 0.140 -8.83e+11 6.28e+12
cos_sin_w_31 2.699e+12 1.83e+12 1.477 0.140 -8.83e+11 6.28e+12
sin_sin_w_32 1.84e+12 1.64e+12 1.120 0.263 -1.38e+12 5.06e+12
cos_cos_w_32 -1.84e+12 1.64e+12 -1.120 0.263 -5.06e+12 1.38e+12
sin_cos_w_32 -1.73e+12 1.66e+12 -1.043 0.297 -4.98e+12 1.52e+12
cos_sin_w_32 -1.73e+12 1.66e+12 -1.043 0.297 -4.98e+12 1.52e+12
sin_sin_w_33 4.705e+11 1.7e+12 0.278 0.781 -2.85e+12 3.79e+12
cos_cos_w_33 -4.705e+11 1.7e+12 -0.278 0.781 -3.79e+12 2.85e+12
sin_cos_w_33 -1.742e+12 1.66e+12 -1.048 0.295 -5e+12 1.52e+12
cos_sin_w_33 -1.742e+12 1.66e+12 -1.048 0.295 -5e+12 1.52e+12
sin_sin_w_34 2.033e+12 1.7e+12 1.196 0.232 -1.3e+12 5.36e+12
cos_cos_w_34 -2.033e+12 1.7e+12 -1.196 0.232 -5.36e+12 1.3e+12
sin_cos_w_34 8.048e+11 1.79e+12 0.449 0.653 -2.7e+12 4.31e+12
cos_sin_w_34 8.048e+11 1.79e+12 0.449 0.653 -2.7e+12 4.31e+12
sin_sin_w_35 2.365e+12 2.78e+12 0.851 0.395 -3.08e+12 7.81e+12
cos_cos_w_35 2.376e+12 2.55e+12 0.933 0.351 -2.62e+12 7.37e+12
sin_cos_w_35 5.073e+12 2.62e+12 1.935 0.053 -6.49e+10 1.02e+13
cos_sin_w_35 -5.111e+12 2.63e+12 -1.945 0.052 -1.03e+13 4e+10
sin_sin_w_36 1.932e+09 9e+09 0.215 0.830 -1.57e+10 1.96e+10
cos_cos_w_36 -1.932e+09 9e+09 -0.215 0.830 -1.96e+10 1.57e+10
sin_cos_w_36 -1.652e+12 1.12e+12 -1.476 0.140 -3.84e+12 5.42e+11
cos_sin_w_36 -1.652e+12 1.12e+12 -1.476 0.140 -3.84e+12 5.42e+11
sin_sin_w_37 1.046e+12 1.54e+12 0.681 0.496 -1.96e+12 4.05e+12
cos_cos_w_37 -1.046e+12 1.54e+12 -0.681 0.496 -4.05e+12 1.96e+12
sin_cos_w_37 -5.886e+11 1.44e+12 -0.408 0.684 -3.42e+12 2.24e+12
cos_sin_w_37 -5.886e+11 1.44e+12 -0.408 0.684 -3.42e+12 2.24e+12
sin_sin_w_38 1.106e+12 1.39e+12 0.794 0.427 -1.62e+12 3.84e+12
cos_cos_w_38 -1.106e+12 1.39e+12 -0.794 0.427 -3.84e+12 1.62e+12
sin_cos_w_38 -6.494e+11 1.35e+12 -0.481 0.631 -3.3e+12 2e+12
cos_sin_w_38 -6.494e+11 1.35e+12 -0.481 0.631 -3.3e+12 2e+12
sin_sin_w_39 1.158e+11 1.43e+12 0.081 0.935 -2.68e+12 2.91e+12
cos_cos_w_39 -1.158e+11 1.43e+12 -0.081 0.935 -2.91e+12 2.68e+12
sin_cos_w_39 -4.097e+11 1.42e+12 -0.288 0.774 -3.2e+12 2.38e+12
cos_sin_w_39 -4.097e+11 1.42e+12 -0.288 0.774 -3.2e+12 2.38e+12
sin_sin_w_40 1.536e+12 1.57e+12 0.979 0.328 -1.54e+12 4.61e+12
cos_cos_w_40 -1.536e+12 1.57e+12 -0.979 0.328 -4.61e+12 1.54e+12
sin_cos_w_40 1.236e+12 1.59e+12 0.779 0.436 -1.87e+12 4.34e+12
cos_sin_w_40 1.236e+12 1.59e+12 0.779 0.436 -1.87e+12 4.34e+12
sin_sin_w_41 -8.439e+11 1.19e+12 -0.711 0.477 -3.17e+12 1.48e+12
cos_cos_w_41 8.439e+11 1.19e+12 0.711 0.477 -1.48e+12 3.17e+12
sin_cos_w_41 -1.015e+12 1.15e+12 -0.883 0.377 -3.27e+12 1.24e+12
cos_sin_w_41 -1.015e+12 1.15e+12 -0.883 0.377 -3.27e+12 1.24e+12
sin_sin_w_42 1.584e+12 1.4e+12 1.134 0.257 -1.15e+12 4.32e+12
cos_cos_w_42 2.656e+12 6.47e+12 0.410 0.682 -1e+13 1.53e+13
sin_cos_w_42 -1.659e+12 7.51e+12 -0.221 0.825 -1.64e+13 1.31e+13
cos_sin_w_42 -1.273e+12 1.57e+12 -0.813 0.416 -4.34e+12 1.79e+12
sin_sin_w_43 8.684e+11 1.15e+12 0.757 0.449 -1.38e+12 3.12e+12
cos_cos_w_43 -8.684e+11 1.15e+12 -0.757 0.449 -3.12e+12 1.38e+12
sin_cos_w_43 -9.452e+11 1.16e+12 -0.818 0.413 -3.21e+12 1.32e+12
cos_sin_w_43 -9.452e+11 1.16e+12 -0.818 0.413 -3.21e+12 1.32e+12
sin_sin_w_44 -2.545e+11 1.1e+12 -0.232 0.817 -2.41e+12 1.9e+12
cos_cos_w_44 2.545e+11 1.1e+12 0.232 0.817 -1.9e+12 2.41e+12
sin_cos_w_44 -2.561e+12 1.03e+12 -2.483 0.013 -4.58e+12 -5.39e+11
cos_sin_w_44 -2.561e+12 1.03e+12 -2.483 0.013 -4.58e+12 -5.39e+11
sin_sin_w_45 -2.656e+12 1.22e+12 -2.185 0.029 -5.04e+12 -2.74e+11
cos_cos_w_45 2.656e+12 1.22e+12 2.185 0.029 2.74e+11 5.04e+12
sin_cos_w_45 7.165e+11 1.23e+12 0.581 0.561 -1.7e+12 3.13e+12
cos_sin_w_45 7.165e+11 1.23e+12 0.581 0.561 -1.7e+12 3.13e+12
sin_sin_w_46 -1.105e+12 1.09e+12 -1.013 0.311 -3.24e+12 1.03e+12
cos_cos_w_46 1.105e+12 1.09e+12 1.013 0.311 -1.03e+12 3.24e+12
sin_cos_w_46 -1.885e+12 1.07e+12 -1.769 0.077 -3.97e+12 2.03e+11
cos_sin_w_46 -1.885e+12 1.07e+12 -1.769 0.077 -3.97e+12 2.03e+11
sin_sin_w_47 -9.741e+11 1.23e+12 -0.794 0.427 -3.38e+12 1.43e+12
cos_cos_w_47 9.741e+11 1.23e+12 0.794 0.427 -1.43e+12 3.38e+12
sin_cos_w_47 -7.387e+10 1.09e+12 -0.068 0.946 -2.21e+12 2.07e+12
cos_sin_w_47 -7.387e+10 1.09e+12 -0.068 0.946 -2.21e+12 2.07e+12
sin_sin_w_48 5.307e+08 3.41e+09 0.156 0.876 -6.14e+09 7.2e+09
cos_cos_w_48 -5.307e+08 3.41e+09 -0.156 0.876 -7.2e+09 6.14e+09
sin_cos_w_48 6.905e+11 6.27e+11 1.102 0.270 -5.38e+11 1.92e+12
cos_sin_w_48 6.905e+11 6.27e+11 1.102 0.270 -5.38e+11 1.92e+12
sin_sin_w_49 -3.105e+12 1.54e+12 -2.012 0.044 -6.13e+12 -8.09e+10
cos_cos_w_49 2.476e+12 1.69e+12 1.463 0.143 -8.4e+11 5.79e+12
sin_cos_w_49 -2.137e+12 1.58e+12 -1.351 0.177 -5.24e+12 9.64e+11
cos_sin_w_49 -2.884e+12 1.52e+12 -1.904 0.057 -5.85e+12 8.55e+10
sin_sin_w_50 -1.425e+12 1.09e+12 -1.306 0.192 -3.56e+12 7.14e+11
cos_cos_w_50 1.425e+12 1.09e+12 1.306 0.192 -7.14e+11 3.56e+12
sin_cos_w_50 -8.047e+11 1.06e+12 -0.762 0.446 -2.87e+12 1.27e+12
cos_sin_w_50 -8.047e+11 1.06e+12 -0.762 0.446 -2.87e+12 1.27e+12
sin_sin_w_51 -8.61e+11 1.05e+12 -0.821 0.412 -2.92e+12 1.19e+12
cos_cos_w_51 8.61e+11 1.05e+12 0.821 0.412 -1.19e+12 2.92e+12
sin_cos_w_51 -1.535e+12 1.05e+12 -1.466 0.143 -3.59e+12 5.17e+11
cos_sin_w_51 -1.535e+12 1.05e+12 -1.466 0.143 -3.59e+12 5.17e+11
sin_sin_w_52 5.207e+11 1.06e+12 0.489 0.625 -1.57e+12 2.61e+12
cos_cos_w_52 -5.207e+11 1.06e+12 -0.489 0.625 -2.61e+12 1.57e+12
sin_cos_w_52 -2.003e+12 1.1e+12 -1.827 0.068 -4.15e+12 1.46e+11
cos_sin_w_52 -2.003e+12 1.1e+12 -1.827 0.068 -4.15e+12 1.46e+11
sin_sin_w_53 6.632e+11 1.11e+12 0.597 0.551 -1.51e+12 2.84e+12
cos_cos_w_53 -6.632e+11 1.11e+12 -0.597 0.551 -2.84e+12 1.51e+12
sin_cos_w_53 -8.888e+09 1.05e+12 -0.008 0.993 -2.06e+12 2.04e+12
cos_sin_w_53 -8.888e+09 1.05e+12 -0.008 0.993 -2.06e+12 2.04e+12
sin_sin_w_54 1.52e+11 9.46e+11 0.161 0.872 -1.7e+12 2.01e+12
cos_cos_w_54 -1.52e+11 9.46e+11 -0.161 0.872 -2.01e+12 1.7e+12
sin_cos_w_54 9.739e+11 9.8e+11 0.994 0.320 -9.46e+11 2.89e+12
cos_sin_w_54 9.739e+11 9.8e+11 0.994 0.320 -9.46e+11 2.89e+12
sin_sin_w_55 2.021e+12 1.03e+12 1.970 0.049 1.06e+10 4.03e+12
cos_cos_w_55 -2.021e+12 1.03e+12 -1.970 0.049 -4.03e+12 -1.06e+10
sin_cos_w_55 1.383e+12 9.77e+11 1.416 0.157 -5.32e+11 3.3e+12
cos_sin_w_55 1.383e+12 9.77e+11 1.416 0.157 -5.32e+11 3.3e+12
sin_sin_w_56 -3.348e+11 1.2e+12 -0.278 0.781 -2.69e+12 2.02e+12
cos_cos_w_56 8.229e+12 7.12e+12 1.156 0.248 -5.72e+12 2.22e+13
sin_cos_w_56 6.276e+12 7.04e+12 0.892 0.372 -7.51e+12 2.01e+13
cos_sin_w_56 -1.457e+12 1.16e+12 -1.253 0.210 -3.73e+12 8.21e+11
sin_sin_w_57 -2.758e+11 9.03e+11 -0.306 0.760 -2.04e+12 1.49e+12
cos_cos_w_57 2.758e+11 9.03e+11 0.306 0.760 -1.49e+12 2.04e+12
sin_cos_w_57 -2.196e+11 8.94e+11 -0.246 0.806 -1.97e+12 1.53e+12
cos_sin_w_57 -2.196e+11 8.94e+11 -0.246 0.806 -1.97e+12 1.53e+12
sin_sin_w_58 -1.201e+12 9.32e+11 -1.288 0.198 -3.03e+12 6.26e+11
cos_cos_w_58 1.201e+12 9.32e+11 1.288 0.198 -6.26e+11 3.03e+12
sin_cos_w_58 -1.815e+12 9.24e+11 -1.963 0.050 -3.63e+12 -3.04e+09
cos_sin_w_58 -1.815e+12 9.24e+11 -1.963 0.050 -3.63e+12 -3.04e+09
sin_sin_w_59 -6.001e+10 1e+12 -0.060 0.952 -2.03e+12 1.91e+12
cos_cos_w_59 6.001e+10 1e+12 0.060 0.952 -1.91e+12 2.03e+12
sin_cos_w_59 -6.546e+11 9.05e+11 -0.724 0.469 -2.43e+12 1.12e+12
cos_sin_w_59 -6.546e+11 9.05e+11 -0.724 0.469 -2.43e+12 1.12e+12
sin_sin_w_60 -5.265e+09 4.08e+09 -1.290 0.197 -1.33e+10 2.73e+09
cos_cos_w_60 5.265e+09 4.08e+09 1.290 0.197 -2.73e+09 1.33e+10
sin_cos_w_60 4.067e+11 5.93e+11 0.686 0.493 -7.55e+11 1.57e+12
cos_sin_w_60 4.067e+11 5.93e+11 0.686 0.493 -7.55e+11 1.57e+12
sin_sin_w_61 4.182e+11 9.58e+11 0.437 0.662 -1.46e+12 2.3e+12
cos_cos_w_61 -4.182e+11 9.58e+11 -0.437 0.662 -2.3e+12 1.46e+12
sin_cos_w_61 -4.249e+11 9.03e+11 -0.470 0.638 -2.2e+12 1.35e+12
cos_sin_w_61 -4.249e+11 9.03e+11 -0.470 0.638 -2.2e+12 1.35e+12
sin_sin_w_62 -1.745e+12 9.29e+11 -1.879 0.060 -3.57e+12 7.55e+10
cos_cos_w_62 1.745e+12 9.29e+11 1.879 0.060 -7.55e+10 3.57e+12
sin_cos_w_62 1.189e+12 8.44e+11 1.408 0.159 -4.66e+11 2.84e+12
cos_sin_w_62 1.189e+12 8.44e+11 1.408 0.159 -4.66e+11 2.84e+12
sin_sin_w_63 2.506e+12 1.38e+12 1.822 0.068 -1.89e+11 5.2e+12
cos_cos_w_63 -2.904e+11 1.38e+12 -0.210 0.834 -3e+12 2.42e+12
sin_cos_w_63 6.027e+11 1.36e+12 0.444 0.657 -2.06e+12 3.26e+12
cos_sin_w_63 -1.464e+11 1.37e+12 -0.107 0.915 -2.84e+12 2.54e+12
sin_sin_w_64 7.059e+11 9.54e+11 0.740 0.459 -1.16e+12 2.58e+12
cos_cos_w_64 -7.059e+11 9.54e+11 -0.740 0.459 -2.58e+12 1.16e+12
sin_cos_w_64 6.888e+10 9.28e+11 0.074 0.941 -1.75e+12 1.89e+12
cos_sin_w_64 6.888e+10 9.28e+11 0.074 0.941 -1.75e+12 1.89e+12
sin_sin_w_65 4.503e+11 8.29e+11 0.543 0.587 -1.18e+12 2.08e+12
cos_cos_w_65 -4.503e+11 8.29e+11 -0.543 0.587 -2.08e+12 1.18e+12
sin_cos_w_65 4.877e+11 7.51e+11 0.649 0.516 -9.85e+11 1.96e+12
cos_sin_w_65 4.877e+11 7.51e+11 0.649 0.516 -9.85e+11 1.96e+12
sin_sin_w_66 1.123e+11 8.48e+11 0.132 0.895 -1.55e+12 1.77e+12
cos_cos_w_66 -1.123e+11 8.48e+11 -0.132 0.895 -1.77e+12 1.55e+12
sin_cos_w_66 1.953e+11 8.65e+11 0.226 0.821 -1.5e+12 1.89e+12
cos_sin_w_66 1.953e+11 8.65e+11 0.226 0.821 -1.5e+12 1.89e+12
sin_sin_w_67 -9.844e+11 8.72e+11 -1.129 0.259 -2.69e+12 7.24e+11
cos_cos_w_67 9.844e+11 8.72e+11 1.129 0.259 -7.24e+11 2.69e+12
sin_cos_w_67 -5.051e+11 8.37e+11 -0.604 0.546 -2.15e+12 1.14e+12
cos_sin_w_67 -5.051e+11 8.37e+11 -0.604 0.546 -2.15e+12 1.14e+12
sin_sin_w_68 -9.133e+11 8.94e+11 -1.022 0.307 -2.67e+12 8.39e+11
cos_cos_w_68 9.133e+11 8.94e+11 1.022 0.307 -8.39e+11 2.67e+12
sin_cos_w_68 -5.978e+11 8.67e+11 -0.689 0.491 -2.3e+12 1.1e+12
cos_sin_w_68 -5.978e+11 8.67e+11 -0.689 0.491 -2.3e+12 1.1e+12
sin_sin_w_69 -2.643e+11 8.4e+11 -0.315 0.753 -1.91e+12 1.38e+12
cos_cos_w_69 2.643e+11 8.4e+11 0.315 0.753 -1.38e+12 1.91e+12
sin_cos_w_69 -3.164e+11 8.59e+11 -0.368 0.713 -2e+12 1.37e+12
cos_sin_w_69 -3.164e+11 8.59e+11 -0.368 0.713 -2e+12 1.37e+12
sin_sin_w_70 -1.794e+12 1.11e+12 -1.610 0.107 -3.98e+12 3.9e+11
cos_cos_w_70 -3.552e+12 5.83e+12 -0.609 0.542 -1.5e+13 7.87e+12
sin_cos_w_70 2.844e+12 5.8e+12 0.491 0.624 -8.52e+12 1.42e+13
cos_sin_w_70 -5.633e+11 1.07e+12 -0.525 0.599 -2.66e+12 1.54e+12
sin_sin_w_71 -8.472e+11 9.25e+11 -0.916 0.360 -2.66e+12 9.66e+11
cos_cos_w_71 8.472e+11 9.25e+11 0.916 0.360 -9.66e+11 2.66e+12
sin_cos_w_71 -5.388e+11 8.19e+11 -0.658 0.511 -2.14e+12 1.07e+12
cos_sin_w_71 -5.388e+11 8.19e+11 -0.658 0.511 -2.14e+12 1.07e+12
sin_sin_w_72 -5.649e+09 4e+09 -1.413 0.158 -1.35e+10 2.19e+09
cos_cos_w_72 5.649e+09 4e+09 1.413 0.158 -2.19e+09 1.35e+10
sin_cos_w_72 2.29e+12 5.62e+11 4.075 0.000 1.19e+12 3.39e+12
cos_sin_w_72 2.29e+12 5.62e+11 4.075 0.000 1.19e+12 3.39e+12
sin_sin_w_73 -2.216e+12 8.25e+11 -2.685 0.007 -3.83e+12 -5.98e+11
cos_cos_w_73 2.216e+12 8.25e+11 2.685 0.007 5.98e+11 3.83e+12
sin_cos_w_73 -1.02e+12 7.54e+11 -1.353 0.176 -2.5e+12 4.57e+11
cos_sin_w_73 -1.02e+12 7.54e+11 -1.353 0.176 -2.5e+12 4.57e+11
sin_sin_w_74 2.611e+11 8.17e+11 0.320 0.749 -1.34e+12 1.86e+12
cos_cos_w_74 -2.611e+11 8.17e+11 -0.320 0.749 -1.86e+12 1.34e+12
sin_cos_w_74 4.686e+11 7.55e+11 0.621 0.535 -1.01e+12 1.95e+12
cos_sin_w_74 4.686e+11 7.55e+11 0.621 0.535 -1.01e+12 1.95e+12
sin_sin_w_75 -8.975e+11 7.44e+11 -1.206 0.228 -2.36e+12 5.61e+11
cos_cos_w_75 8.975e+11 7.44e+11 1.206 0.228 -5.61e+11 2.36e+12
sin_cos_w_75 6.039e+11 7.79e+11 0.775 0.438 -9.24e+11 2.13e+12
cos_sin_w_75 6.039e+11 7.79e+11 0.775 0.438 -9.24e+11 2.13e+12
sin_sin_w_76 7.713e+11 7.04e+11 1.096 0.273 -6.08e+11 2.15e+12
cos_cos_w_76 -7.713e+11 7.04e+11 -1.096 0.273 -2.15e+12 6.08e+11
sin_cos_w_76 -4.766e+09 7e+11 -0.007 0.995 -1.38e+12 1.37e+12
cos_sin_w_76 -4.766e+09 7e+11 -0.007 0.995 -1.38e+12 1.37e+12
sin_sin_w_77 5.227e+11 1.2e+12 0.436 0.662 -1.82e+12 2.87e+12
cos_cos_w_77 1.158e+12 1.11e+12 1.047 0.295 -1.01e+12 3.33e+12
sin_cos_w_77 -5.225e+11 1.18e+12 -0.444 0.657 -2.83e+12 1.78e+12
cos_sin_w_77 2.879e+10 1.1e+12 0.026 0.979 -2.12e+12 2.18e+12
sin_sin_w_78 1.089e+12 7.18e+11 1.517 0.129 -3.18e+11 2.5e+12
cos_cos_w_78 -1.089e+12 7.18e+11 -1.517 0.129 -2.5e+12 3.18e+11
sin_cos_w_78 -2.705e+11 7.37e+11 -0.367 0.714 -1.72e+12 1.17e+12
cos_sin_w_78 -2.705e+11 7.37e+11 -0.367 0.714 -1.72e+12 1.17e+12
sin_sin_w_79 -5.987e+11 6.96e+11 -0.860 0.390 -1.96e+12 7.66e+11
cos_cos_w_79 5.987e+11 6.96e+11 0.860 0.390 -7.66e+11 1.96e+12
sin_cos_w_79 4.181e+11 6.69e+11 0.625 0.532 -8.93e+11 1.73e+12
cos_sin_w_79 4.181e+11 6.69e+11 0.625 0.532 -8.93e+11 1.73e+12
sin_sin_w_80 1.306e+12 9.47e+11 1.379 0.168 -5.5e+11 3.16e+12
cos_cos_w_80 -1.306e+12 9.47e+11 -1.379 0.168 -3.16e+12 5.5e+11
sin_cos_w_80 -1.041e+12 9.08e+11 -1.146 0.252 -2.82e+12 7.39e+11
cos_sin_w_80 -1.041e+12 9.08e+11 -1.146 0.252 -2.82e+12 7.39e+11
sin_sin_w_81 2.34e+11 6.26e+11 0.374 0.708 -9.92e+11 1.46e+12
cos_cos_w_81 -2.34e+11 6.26e+11 -0.374 0.708 -1.46e+12 9.92e+11
sin_cos_w_81 -5.534e+11 6.73e+11 -0.823 0.411 -1.87e+12 7.65e+11
cos_sin_w_81 -5.534e+11 6.73e+11 -0.823 0.411 -1.87e+12 7.65e+11
Omnibus: 2091.531 Durbin-Watson: 0.387
Prob(Omnibus): 0.000 Jarque-Bera (JB): 29768.867
Skew: -1.933 Prob(JB): 0.00
Kurtosis: 15.228 Cond. No. 1.16e+16


Warnings:
[1] Standard Errors are heteroscedasticity robust (HC1)
[2] The smallest eigenvalue is 8.06e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
function to get the regression prediction
In [32]:
def prediction(model, components, feature_num, dataframe):
    coefs = np.array([model.params[coef] for coef in model.params.index])
    fourier_components = np.hstack((np.array([1]*dataframe.shape[0]).reshape(dataframe.shape[0],1), 
                                    dataframe.iloc[:, 1:feature_num*components+1].values))
    return coefs.dot(fourier_components.T)
In [33]:
#model = models[40]
pred = prediction(models[80], components=81, feature_num=4, dataframe=regression_taxi)
In [11]:
def prediction_real(df_1, df_2, segment, title, ylabel, size, labels):
    plt.figure(figsize=size)
    plt.plot(df_1[segment[0]: segment[1]], alpha = 0.7, label = labels[0])
    plt.plot(df_2[segment[0]: segment[1]], alpha=0.5, label = labels[1], color="red")
    plt.xlabel('time (hours)')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid()
In [56]:
prediction_real(pred, regression_taxi.taxi_call_num.values, (0, 745), "January", 'taxi calls', (15, 5), ("predict", "real"))
In [57]:
prediction_real(pred, regression_taxi.taxi_call_num.values, (0, 4344), "six months", 'taxi calls', (50, 5), ("predict", "real"))

#double click makes plot langer
resids of the model. we make a plot to compare residuals of the model and real taxi values to find is it any significant structure in residual plot.
In [60]:
prediction_real(regression_taxi.taxi_call_num.values, models[80].resid.values, (0, 745), "first january week", 'residuals', 
                (30, 5), ("real values", "residuals"))
In [59]:
prediction_real(regression_taxi.taxi_call_num.values, models[80].resid.values, (0, 168), "first january week", 'residuals', 
                (15, 5), ("real values", "residuals"))
not accounted structure stay in residual plot
In [61]:
plt.figure(figsize(80,10))
sm.tsa.seasonal_decompose(models[80].resid).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(models[80].resid)[1])
Критерий Дики-Фуллера: p=0.000000
<Figure size 5760x720 with 0 Axes>
In [92]:
#sm.tsa.seasonal_decompose(models[80].resid).__dict__.keys()
#plt.plot(sm.tsa.seasonal_decompose(models[80].resid).__dict__["resid"][:96])
as linear regression features we choose 80 components of fourier serie
$$sin(2*{\pi}*k_f *time/168)*sin(2*{\pi}*k_f *time/28)$$ $$cos(2*{\pi}*k_f *time/168)*sin(2*{\pi}*k_f *time/28)$$ $$sin(2*{\pi}*k_f *time/168)*cos(2*{\pi}*k_f *time/28)$$ $$cos(2*{\pi}*k_f *time/168)*cos(2*{\pi}*k_f *time/28)$$
In [62]:
regression_taxi.iloc[:, :321].head()
Out[62]:
taxi_call_num sin_sin_w_1 cos_cos_w_1 sin_cos_w_1 cos_sin_w_1 sin_sin_w_2 cos_cos_w_2 sin_cos_w_2 cos_sin_w_2 sin_sin_w_3 ... sin_cos_w_78 cos_sin_w_78 sin_sin_w_79 cos_cos_w_79 sin_cos_w_79 cos_sin_w_79 sin_sin_w_80 cos_cos_w_80 sin_cos_w_80 cos_sin_w_80
2015-01-01 00:00:00 1125.0 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000
2015-01-01 01:00:00 922.0 0.008320 0.974246 0.036454 0.222365 0.032424 0.898450 0.067329 0.432671 0.069809 ... 0.049516 0.950484 -0.167501 -0.426320 0.080664 0.885262 -0.116526 -0.616526 0.092926 0.773099
2015-01-01 02:00:00 767.0 0.032424 0.898450 0.067329 0.432671 0.116526 0.616526 0.092926 0.773099 0.216942 ... 0.390916 -0.390916 0.285635 -0.580390 0.227786 -0.727786 0.287365 -0.212635 0.065589 -0.931615
2015-01-01 03:00:00 774.0 0.069809 0.776915 0.087537 0.619569 0.216942 0.216942 0.049516 0.950484 0.297571 ... -0.388740 -0.611260 0.118388 0.825495 -0.518693 -0.188414 -0.188255 0.811745 -0.390916 0.390916
2015-01-01 04:00:00 548.0 0.116526 0.616526 0.092926 0.773099 0.287365 -0.212635 -0.065589 0.931615 0.188255 ... -0.487464 0.487464 -0.663119 -0.163119 0.151353 0.714673 -0.244415 -0.744415 0.507534 0.358492

5 rows × 321 columns

In [66]:
full_regres_taxi = regression_taxi.iloc[:, :321]
full_regres_taxi.shape
Out[66]:
(4344, 321)
its possible to introduce additional feature to consider snow storm in january 26-27. But this will not help us in making future prediction.
In [67]:
#full_regres_taxi = full_regres_taxi.drop(['jan_anom'], axis=1)
full_regres_taxi['jan_anom'] = [full_regres_taxi.loc[date].taxi_call_num if date.month == 1 and date.day in [27] and date.year == 2015 else 0 for date in full_regres_taxi.index]
full_regres_taxi.shape
Out[67]:
(4344, 322)
In [69]:
m1 = smf.ols('taxi_call_num ~ ' + ' + '.join(list(full_regres_taxi.columns)[1:]) + ' + jan_anom', data=full_regres_taxi)
fin_model_anomal = m1.fit(cov_type='HC1')
In [70]:
#result = prediction(fin_model_anomal, components=81, feature_num=4, dataframe=regression_taxi)

coefs = np.array([fin_model_anomal.params[coef] for coef in fin_model_anomal.params.index])
fourier_components = np.hstack((np.array([1]*full_regres_taxi.shape[0]).reshape(full_regres_taxi.shape[0],1), 
                                    full_regres_taxi.iloc[:, 1:4*80+2].values))
result = coefs.dot(fourier_components.T)
In [71]:
prediction_real(regression_taxi.taxi_call_num.values, result, (744, 1416), "february", 'taxi calls', 
                (15, 5), ("real values", "prediction"))

Now we are going to select appropriative ARIMA model for residual signal - data not encountered by linear regression model

In [74]:
def plot_autocorr(resid_data):
    plt.figure(figsize(15,10))
    ax = plt.subplot(211)
    sm.graphics.tsa.plot_acf(resid_data.squeeze(), lags=168, ax=ax)
    pylab.show()
    ax = plt.subplot(212)
    sm.graphics.tsa.plot_pacf(resid_data.squeeze(), lags=168, ax=ax)
    pylab.show()   
In [75]:
plot_autocorr(fin_model_anomal.resid.values)
single ordinary differentiation of the time series
In [77]:
full_regres_taxi["residuals"] = fin_model_anomal.resid
full_regres_taxi["residual_diff1"] = full_regres_taxi["residuals"] - full_regres_taxi["residuals"].shift(1)
In [78]:
plot_autocorr(full_regres_taxi["residual_diff1"][1:].values)
select search parameters from autocorrelation and partial autocorrelation plots (autocorrelation - q, Q)
In [41]:
ps = range(0, 12)
qs = range(0, 6)
Ps = range(0, 2)
Qs = range(0, 2)
In [42]:
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
Out[42]:
288
In [52]:
%%time
residuals_results = []
residuals_best_aic = float("inf")
warnings.filterwarnings('ignore')
resid_wrong_param = 0

for counter, param in enumerate(parameters_list):
    if counter % 10 == 0:
        print(counter, end=' ')
    #some of the parameters set are incompatible
    try:
        model=sm.tsa.statespace.SARIMAX(full_regres_taxi["residuals"], order=(param[0], 1, param[1]), 
                                        seasonal_order=(param[2], 0, param[3], 24)).fit(disp=-1)
    #print failure parameters and continue
    except LinAlgError:
        print("singular matrix: ", param)
        continue
    except ValueError:
        resid_wrong_param += 1
        continue
    aic = model.aic
    #save best model, aic, parameters 
    if aic < residuals_best_aic:
        residuals_best_model = model
        residuals_best_aic = aic
        residuals_best_param = param
    residuals_results.append([param, model.aic])
    
warnings.filterwarnings('default')
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 singular matrix:  (8, 5, 0, 1)
220 230 240 250 260 270 280 Wall time: 1h 50min 56s
In [53]:
print(sorted(residuals_results, key=lambda tup: tup[1], reverse=False)[:5])
[[(11, 1, 1, 1), 49350.31255143054], [(11, 2, 1, 1), 49367.32498883665], [(10, 1, 1, 1), 49370.80657227062], [(11, 5, 1, 1), 49372.33337384747], [(11, 3, 1, 1), 49372.341106499065]]
In [54]:
plt.figure(figsize(15,12))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(residuals_best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(residuals_best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
residual signal from best selected ARIMA model looks completely decorrelated
In [79]:
best_arima = sm.tsa.statespace.SARIMAX(full_regres_taxi["residuals"], order=(11, 1, 1), seasonal_order=(1, 0, 1, 24)).fit(disp=-1)
In [55]:
plt.figure(figsize(15,8))
plt.subplot(211)
residuals_best_model.resid[1:].plot()
plt.ylabel(u'Residuals')

ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(residuals_best_model.resid[1:].values.squeeze(), lags=168, ax=ax)

print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(residuals_best_model.resid[1:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(residuals_best_model.resid[1:])[1])
Критерий Стьюдента: p=0.991722
Критерий Дики-Фуллера: p=0.000000
In [80]:
prediction_real(full_regres_taxi["residuals"], best_arima.fittedvalues, (744, 1416), "february", 'taxi calls', 
                (15, 5), ("real residuals", "arima fitted values"))
In [81]:
prediction_real(full_regres_taxi["residuals"], best_arima.fittedvalues, (3624, 4344), "june", 'taxi calls', 
                (15, 5), ("real residuals", "arima fitted values"))
ARIMA fitted values looks good
In [82]:
best_arima.fittedvalues.shape
Out[82]:
(4344,)
add ARIMA with parameters (p, q, P, Q) = (11, 1, 1, 1) fitted values to values predicted by linear regression model and get final result
In [83]:
final_res = result + best_arima.fittedvalues
plot final fit for february and june mounth
In [85]:
prediction_real(regression_taxi.taxi_call_num.values, final_res.values, (3624, 4344), "june", 'taxi calls', 
                (15, 5), ("real taxi calls", "final prediction"))
In [86]:
prediction_real(regression_taxi.taxi_call_num.values, final_res.values, (744, 1416), "february", 'taxi calls', 
                (15, 5), ("real taxi calls", "final prediction"))
In [ ]:
 

Try to fit data using only ARIMA model without linear regression on fourier features

look at ACF и PACF of time serie:
In [6]:
diff_data=pd.DataFrame(taxi_data.taxi_call_num, columns=["taxi_data.taxi_call_num"])
In [88]:
plot_autocorr(taxi_data.taxi_call_num.values)
performe ordinary differentiation
In [7]:
diff_data["diff_168"] = taxi_data.taxi_call_num - taxi_data.taxi_call_num.shift(168)
In [90]:
plot_autocorr(diff_data["diff_168"][168:].values)
performe seasonal differentiation with weekly period (168 hours)
In [8]:
diff_data["diff_168_1"] = diff_data["diff_168"][168:] - diff_data["diff_168"][168:].shift(1)
In [92]:
plot_autocorr(diff_data["diff_168_1"][169:].values)
select search parameters from autocorrelation and partial autocorrelation plots (autocorrelation - q, Q)
Q=1, q=5, P=1, p=11
In [26]:
ps = range(0, 12)
qs = range(0, 6)
Ps = range(0, 2)
Qs = range(0, 2)
In [27]:
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
Out[27]:
288
In [28]:
%%time
results = []
best_aic = float("inf")
warnings.filterwarnings('ignore')
wrong_parameters_counter = 0

for counter, param in enumerate(parameters_list):
    if counter % 10 == 0:
        print(counter, end=' ')
    #some of the parameters set are incompatible
    try:
        model=sm.tsa.statespace.SARIMAX(taxi_data.taxi_call_num, order=(param[0], 1, param[1]), 
                                        seasonal_order=(param[2], 1, param[3], 24)).fit(disp=-1)
    #print failure parameters and continue
    except LinAlgError:
        print("singular matrix: ", param)
        continue
    except ValueError:
        wrong_parameters_counter += 1
        continue
    aic = model.aic
    #save best model, aic, parameters 
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, model.aic])
    
warnings.filterwarnings('default')
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 Wall time: 7h 34min 23s
In [29]:
print(wrong_parameters_counter)
97
In [30]:
print(sorted(results, key=lambda tup: tup[1], reverse=False)[:5])
[[(9, 1, 1, 1), 51221.68728769896], [(8, 1, 1, 1), 51301.58797393386], [(9, 2, 1, 1), 51310.93349961903], [(7, 1, 1, 1), 51351.98252731767], [(8, 2, 1, 1), 51360.55513375569]]
look at residuals of best selected SARIMA model
In [31]:
plt.figure(figsize(15,12))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(best_model.resid.values.squeeze(), lags=168, ax=ax)
pylab.show()
In [32]:
plt.figure(figsize(15,8))
plt.subplot(211)
best_model.resid[1:].plot()
plt.ylabel(u'Residuals')

ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[1:].values.squeeze(), lags=168, ax=ax)

print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[1:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[1:])[1])
Критерий Стьюдента: p=0.479122
Критерий Дики-Фуллера: p=0.000000

Согласно критерию Стьюдента - остатки не смещены

In [9]:
best_arima_1 = sm.tsa.statespace.SARIMAX(taxi_data.taxi_call_num, order=(9, 1, 1), 
                                        seasonal_order=(1, 1, 1, 24)).fit(disp=-1)
plot final fit for february and june mounth
In [15]:
prediction_real(regression_taxi.taxi_call_num.values, best_arima_1.fittedvalues.values, (3624, 4344), "june", 'taxi calls', 
                (15, 5), ("real taxi calls", "final SARIMA fitted"))
In [16]:
prediction_real(regression_taxi.taxi_call_num.values, best_arima_1.fittedvalues.values, (744, 1416), "february", 'taxi calls', 
                (15, 5), ("real taxi calls", "final SARIMA fitted"))
Using only SARIMA (parameters (p, q, P, Q) = (9, 1, 1, 1)) prediction model with one ordinary and one seasonal (168 hour) differentiation we got visualy good fitted values for time serie. SARIMA is very time and resource consuming especially with seasonal differentiation. It give good fitted results but is pure for prediction of such taxi trips time serie.
In [ ]: